# Neural Networks

Neural networks are a way of parametrizing non-linear functions. At a very basic level, they are formed by a composition of non-linear functions. The function is defined by a layered architecture: the mapping from the input layer to the output layer is performed via hidden layers. Each layer $k$ produces an output $z_k$ that is a non-linear function of a weighted combination of the outputs of the previous layer, $z_k = g_k(W_k z_{k-1})$.

Once the architecture and the activation functions $g_k(\cdot)$ are defined, the weights $W_k$ are trained. If all the functions $g_k$ are (sub-)differentiable, then, via the chain rule, (sub-)gradients with respect to the weights exist and can be computed. The weights are trained via different variants of gradient descent.

In [None]:
import numpy as np 
import matplotlib as mpl 
import matplotlib.pyplot as plt 

from sklearn import cluster, datasets, mixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split

import ipywidgets
from ipywidgets import interact, interactive, interact_manual, fixed

from utilities import plot_helpers


%matplotlib inline
%load_ext autoreload
%autoreload 2
from matplotlib import rcParams
# Global matplotlib defaults: figure size and font size.
rcParams['figure.figsize'] = (10, 5) # Change this if figures look ugly.
rcParams['font.size'] = 16

import warnings
# Silence every warning so that it does not clutter the notebook output.
warnings.filterwarnings("ignore")

## Classification Demo

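Neural network training has a lot of hyperparameters. Architecture, learning rate, batch size, optimization algorithm, and random seed are just a few of them. Because of the non-convexity of the training objective, different choices of these hyperparameters can lead to different solutions.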

In [None]:
# Number of points generated for each toy dataset of the classification demo.
n_samples = 200

# Figure defaults for this demo; change them if figures look ugly.
rcParams.update({'figure.figsize': (10, 5), 'font.size': 16})
def _four_cluster_dataset(centers, noise):
    """Sample four 2-D Gaussian clusters forming a binary problem.

    Args:
        centers: Four (x, y) cluster centres. The first two clusters are
            labelled +1 and the last two -1.
        noise: Standard deviation of the Gaussian noise around each centre.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape ``(4 * step, 2)`` and ``Y`` holds the
        matching +1/-1 labels, where ``step = int(n_samples / 4)``.
    """
    step = int(n_samples / 4)
    # Clusters are drawn in the given order so that, after seeding, the global
    # random stream is always consumed in the same way.
    X = np.vstack([np.asarray(center) + noise * np.random.randn(step, 2)
                   for center in centers])
    Y = np.repeat([1., 1., -1., -1.], step)
    return X, Y


def _make_dataset(dataset, noise):
    """Return ``(X, Y)`` for the toy dataset called ``dataset``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    # Strings are compared with ``==``: identity (``is``) only works by
    # accident of interning and triggers a SyntaxWarning on recent Pythons.
    if dataset == 'blobs':
        return datasets.make_blobs(n_samples=n_samples, centers=2,
                                   random_state=3, cluster_std=10 * noise)
    if dataset == 'circles':
        return datasets.make_circles(n_samples=n_samples, factor=.5,
                                     noise=noise, random_state=42)
    if dataset == 'moons':
        return datasets.make_moons(n_samples=n_samples, noise=noise,
                                   random_state=42)
    if dataset == 'xor':
        np.random.seed(42)
        return _four_cluster_dataset([(0, 0), (1, 1), (0, 1), (1, 0)], noise)
    if dataset == 'periodic':
        return _four_cluster_dataset([(0, 0), (0, 2), (0, 1), (0, 3)], noise)
    raise ValueError("Unknown dataset: {!r}".format(dataset))


def mlp(dataset, hidden_layer_sizes, activation, solver, reg, noise):
    """Fit an MLP classifier on a 2-D toy dataset and plot the result.

    The data is standardized and split 60/40 into train and test sets. The
    test accuracy is printed, then the points and the predicted probability
    of the positive class are drawn on a new figure.

    Args:
        dataset: One of 'blobs', 'circles', 'moons', 'xor' or 'periodic'.
        hidden_layer_sizes: Tuple with the number of units of each hidden layer.
        activation: Activation name passed to ``MLPClassifier``.
        solver: Solver name passed to ``MLPClassifier``.
        reg: Base-10 exponent of the L2 penalty, i.e. ``alpha = 10 ** reg``.
        noise: Noise level of the generated data.

    Raises:
        ValueError: If ``dataset`` is not a supported name.
    """
    # Seed the global generator: the hand-made datasets and the train/test
    # split both draw from it.
    np.random.seed(42)
    classifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                               activation=activation,
                               solver=solver,
                               alpha=np.power(10., reg),
                               random_state=1,
                               learning_rate_init=.1)

    X, Y = _make_dataset(dataset, noise)
    # The scikit-learn generators label the classes 0/1; use -1/+1 everywhere.
    Y[Y == 0] = -1

    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.4)

    classifier.fit(X_train, y_train)
    print(classifier.score(X_test, y_test))

    # Plot the data points of both classes.
    plt.figure()
    ax = plt.axes()
    for class_value, marker, name in ((1, 'r*', '+'), (-1, 'bs', '-')):
        idx = np.where(Y == class_value)[0]
        plot_helpers.plot_data(X[idx, 0], X[idx, 1], fig=ax,
                               options={'marker': marker, 'label': name})

    # Evaluate the classifier on a grid covering the data plus a unit margin.
    x_min, y_min = np.min(X, 0) - 1
    x_max, y_max = np.max(X, 0) + 1
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Xplot = np.c_[XX.ravel(), YY.ravel()]

    # MLPClassifier has no decision_function, so colour the plane with the
    # predicted probability of the second class (label +1).
    Z = classifier.predict_proba(Xplot)[:, 1].reshape(XX.shape)
    plt.contourf(XX, YY, Z, cmap=plt.cm.jet, alpha=.3)
 
 
# Interactive front-end for `mlp`: each list gives the choices offered for the
# argument of the same name, and the two sliders drive `reg` and `noise`.
interact(mlp, 
 dataset=['blobs', 'circles', 'moons', 'xor', 'periodic'],
 activation=['logistic', 'relu', 'identity', 'tanh'],
 solver=['sgd', 'adam','lbfgs'],
 hidden_layer_sizes=[(50, ), (100, ), (50, 50), (100, 100), (50, 50, 50), (100, 100, 100)],
 # `reg` is the base-10 exponent of the penalty: mlp uses alpha = 10 ** reg.
 reg=ipywidgets.FloatSlider(value=-3,
 min=-3,
 max=3,
 step=0.1,
 readout_format='.1f',
 description='reg 10^:',
 style={'description_width': 'initial'},
 continuous_update=False),
 # `noise` is forwarded to the dataset generators inside mlp.
 noise=ipywidgets.FloatSlider(value=0.05,
 min=0.01,
 max=0.3,
 step=0.01,
 readout_format='.2f',
 description='noise:',
 style={'description_width': 'initial'},
 continuous_update=False), 
 );

## Keras Demo

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras.datasets import mnist
from keras import backend as K


In [None]:
# Train a fully connected network (two hidden layers of 512 ReLU units) on
# MNIST and report the loss and accuracy on the test set.
np.random.seed(123)  # for reproducibility

batch_size = 128
num_classes = 10
epochs = 6

# Input image dimensions.
img_rows, img_cols = 28, 28

# 1. Load pre-shuffled MNIST data into train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Image layout expected by convolutional layers. The dense model below works
# on flattened images and does not use it.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

# 2. Preprocess input data: flatten every image and rescale pixels to [0, 1].
x_train = x_train.reshape(x_train.shape[0], img_rows * img_cols)
x_test = x_test.reshape(x_test.shape[0], img_rows * img_cols)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# 3. Preprocess class labels (one-hot encoding).
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# 4. Define model architecture.
# The name is passed to the constructor: assigning to ``ANN.name`` afterwards
# fails on Keras versions where ``name`` is a read-only property.
ANN = Sequential(name='ANN')
ANN.add(Dense(512, activation='relu', input_shape=(img_rows * img_cols,)))
# Optional regularization, uncomment to experiment:
# ANN.add(BatchNormalization())
# ANN.add(Dropout(0.2))
ANN.add(Dense(512, activation='relu'))
# ANN.add(BatchNormalization())
# ANN.add(Dropout(0.2))
ANN.add(Dense(num_classes, activation='softmax'))

model = ANN

model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

try:
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(x_test, y_test))
except KeyboardInterrupt:
    # Deliberate: interrupting the kernel stops training early, and the
    # partially trained model is still evaluated below.
    pass
score = model.evaluate(x_test, y_test, verbose=0)
print('')
print('Test loss:', score[0])
print('Test accuracy:', score[1])


In [None]:
# Train the models listed in `models` (by default only the CNN) on MNIST and
# report their loss and accuracy on the test set.
np.random.seed(123)  # for reproducibility

batch_size = 128
num_classes = 10
epochs = 6

# Input image dimensions.
img_rows, img_cols = 28, 28

# 1. Load pre-shuffled MNIST data into train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)

# 2. Preprocess input data: rescale pixels to [0, 1].
# This is done once, before the loop; doing it per model would rescale the
# data again for every additional entry in `models`.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# 3. Preprocess class labels (one-hot encoding).
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# 4. Define model architectures.
# Names are passed to the constructor: assigning to ``model.name`` afterwards
# fails on Keras versions where ``name`` is a read-only property.
CNN = Sequential(name='CNN')
CNN.add(Conv2D(32, kernel_size=(3, 3),
               activation='relu',
               input_shape=input_shape))
CNN.add(Conv2D(64, (3, 3), activation='relu'))
CNN.add(MaxPooling2D(pool_size=(2, 2)))
CNN.add(Dropout(0.25))
CNN.add(Flatten())
CNN.add(Dense(128, activation='relu'))
CNN.add(Dropout(0.5))
CNN.add(Dense(num_classes, activation='softmax'))

ANN = Sequential(name='ANN')
ANN.add(Dense(512, activation='relu', input_shape=(img_rows * img_cols,)))
ANN.add(Dropout(0.2))
ANN.add(Dense(512, activation='relu'))
ANN.add(Dropout(0.2))
ANN.add(Dense(num_classes, activation='softmax'))

# Add ANN to this list to train it as well.
models = [CNN]

for model in models:
    # Give the images the layout that the current model expects.
    if model.name == 'ANN':
        x_train = x_train.reshape(x_train.shape[0], img_rows * img_cols)
        x_test = x_test.reshape(x_test.shape[0], img_rows * img_cols)
    elif model.name == 'CNN':
        x_train = x_train.reshape(x_train.shape[0], *input_shape)
        x_test = x_test.reshape(x_test.shape[0], *input_shape)

    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])
    try:
        history = model.fit(x_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=1,
                            validation_data=(x_test, y_test))
    except KeyboardInterrupt:
        # Deliberate: interrupting the kernel stops training early, and the
        # partially trained model is still evaluated below.
        pass

    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])



## Stochastic Learning strategies

In [None]:
# Learning-rate schedules and momentum settings to compare. The three lists
# below are parallel: entry i of `params` is drawn with `labels[i]` and
# `plot_args[i]` (green = plain momentum, blue = Nesterov's momentum,
# dashed = inverse-scaling learning rate).
params = [
    {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': 0,
     'learning_rate_init': 0.2},
    {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': .9,
     'nesterovs_momentum': False, 'learning_rate_init': 0.2},
    {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': .9,
     'nesterovs_momentum': True, 'learning_rate_init': 0.2},
    {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': 0,
     'learning_rate_init': 0.2},
    # Plain momentum comes before Nesterov's momentum, as in `labels`.
    {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': .9,
     'nesterovs_momentum': False, 'learning_rate_init': 0.2},
    {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': .9,
     'nesterovs_momentum': True, 'learning_rate_init': 0.2},
    {'solver': 'adam', 'learning_rate_init': 0.01},
]

labels = [
    "constant learning-rate",
    "constant with momentum",
    "constant with Nesterov's momentum",
    "inv-scaling learning-rate",
    "inv-scaling with momentum",
    "inv-scaling with Nesterov's momentum",
    "adam",
]

plot_args = [
    {'c': 'red', 'linestyle': '-'},
    {'c': 'green', 'linestyle': '-'},
    {'c': 'blue', 'linestyle': '-'},
    {'c': 'red', 'linestyle': '--'},
    {'c': 'green', 'linestyle': '--'},
    {'c': 'blue', 'linestyle': '--'},
    {'c': 'black', 'linestyle': '-'},
]

def plot_on_dataset(dataset):
    """Plot the training loss curve of every configuration in ``params``.

    One ``MLPClassifier`` is fitted per entry of the module-level ``params``
    list on the min-max scaled data, and its ``loss_curve_`` is drawn with the
    matching entry of ``labels`` and ``plot_args``.

    Args:
        dataset: One of 'iris', 'digits', 'circles' or 'moons'.

    Raises:
        ValueError: If ``dataset`` is not a supported name.
    """
    # Load the dataset.
    max_iter = 400
    if dataset == "iris":
        data = datasets.load_iris()
        X, y = data.data, data.target
    elif dataset == "digits":
        data = datasets.load_digits()
        X, y = data.data, data.target
        # Digits gets a much smaller iteration budget than the other datasets.
        max_iter = 15
    elif dataset == "circles":
        X, y = datasets.make_circles(noise=0.2, factor=0.5, random_state=1)
    elif dataset == 'moons':
        X, y = datasets.make_moons(noise=0.3, random_state=0)
    else:
        raise ValueError("Unknown dataset: {!r}".format(dataset))
    X = MinMaxScaler().fit_transform(X)

    # Train one classifier per configuration and plot its loss curve.
    plt.figure()
    for label, param, args in zip(labels, params, plot_args):
        classifier = MLPClassifier(verbose=0,
                                   random_state=0,
                                   max_iter=max_iter, **param)
        classifier.fit(X, y)
        plt.plot(classifier.loss_curve_, label=label, **args)

    plt.legend(ncol=2, loc="best")
    plt.xlabel('iterations')
    plt.ylabel('Error')

# Interactive front-end for plot_on_dataset; the list gives the dataset choices.
interact(plot_on_dataset, dataset=['iris', 'digits', 'circles', 'moons']);

# Universal Function Approximator

In [None]:
from sklearn import svm

def laplacian_kernel(X, Y, bw):
    """Compute the Laplacian kernel matrix between the rows of X and Y.

    Entry ``(i, j)`` is ``exp(-bw * ||X[i] - Y[j]||_1)``.

    Args:
        X: Array of shape ``(n_x, d)``.
        Y: Array of shape ``(n_y, d)``.
        bw: Factor that multiplies the L1 distance inside the exponential.

    Returns:
        Array of shape ``(n_x, n_y)``.
    """
    n_x, n_y = X.shape[0], Y.shape[0]
    gram = np.empty((n_x, n_y))
    # Fill one column per row of Y, so only an (n_x, d) temporary is needed.
    for j in range(n_y):
        l1_dist = np.linalg.norm(X - Y[j, :], ord=1, axis=1)
        gram[:, j] = np.exp(-(bw * l1_dist))
    return gram

def process_regressor(regressor, xtrain, ytrain, xplot, yplot):
    """Fit ``regressor`` on 1-D data and plot its prediction.

    Draws the training points, the true function and the fitted curve on the
    current matplotlib axes.

    Args:
        regressor: Estimator exposing ``fit`` and ``predict``.
        xtrain: 1-D array of training inputs.
        ytrain: Training targets.
        xplot: 1-D array of inputs at which the curves are drawn.
        yplot: True function values at ``xplot``.
    """
    # The estimator is given column vectors (one feature per sample).
    train_column = np.reshape(xtrain, (xtrain.size, 1))
    plot_column = np.reshape(xplot, (xplot.size, 1))

    regressor.fit(train_column, ytrain)
    yhat = regressor.predict(plot_column)

    plt.scatter(xtrain, ytrain, label="Training data", alpha=0.2)
    plt.plot(xplot, yplot, 'r-', label="True Function")
    plt.plot(xplot, yhat, 'g-', label="Prediction")
    plt.legend(loc='lower center')

    # The y-range follows the true function (scaled by 1.1), not the prediction.
    lower = np.min(yplot) * 1.1
    upper = np.max(yplot) * 1.1
    plt.ylim([lower, upper])

def NNregressor(activation, solver, hidden_layer_size, reg, xtrain, ytrain, xplot, yplot):
    """Fit an ``MLPRegressor`` on the training data and plot its prediction.

    Args:
        activation: Activation name passed to ``MLPRegressor``.
        solver: Solver name passed to ``MLPRegressor``.
        hidden_layer_size: Tuple with the number of units of each hidden layer.
        reg: Value used for the ``alpha`` penalty.
        xtrain, ytrain: Training inputs and targets.
        xplot, yplot: Plotting inputs and true function values.
    """
    settings = {
        'activation': activation,
        'solver': solver,
        'alpha': reg,
        'random_state': 0,
        'hidden_layer_sizes': hidden_layer_size,
        'tol': 1e-6,
        'max_iter': 1000,
    }
    network = MLPRegressor(**settings)
    process_regressor(network, xtrain, ytrain, xplot, yplot)

def SVMregressor(kernel, bw, reg, xtrain, ytrain, xplot, yplot):
    """Fit a support vector regressor on the training data and plot it.

    Args:
        kernel: 'rbf', 'laplacian', or any other kernel accepted by
            ``svm.SVR`` (passed through unchanged).
        bw: Base-10 exponent of the bandwidth; the kernel coefficient is
            ``gamma = 10 ** (-bw)``.
        reg: Regularization strength; the SVR is built with ``C = 1 / reg``,
            so it must be non-zero.
        xtrain, ytrain: Training inputs and targets.
        xplot, yplot: Plotting inputs and true function values.
    """
    # Computed once for every kernel, so that kernels other than 'rbf' and
    # 'laplacian' no longer fail on an undefined ``gamma``.
    gamma = np.power(10., -bw)
    coef0 = 0

    svr_kernel = kernel
    if kernel == 'laplacian':
        # SVR has no built-in Laplacian kernel; pass a callable instead.
        def svr_kernel(X, Y):
            return laplacian_kernel(X, Y, gamma)

    regressor = svm.SVR(kernel=svr_kernel, C=1. / reg, gamma=gamma, coef0=coef0)
    process_regressor(regressor, xtrain, ytrain, xplot, yplot)

 
def uat_demo(function, n_samples, noise, family):
    """Sample a noisy 1-D function and open a nested fitting widget.

    Training inputs are drawn uniformly from [-6, 6] and Gaussian noise is
    added to the targets. A second ``interact`` widget then lets the user fit
    either a neural network or a support vector regressor to the samples.

    Args:
        function: Id of the target function: 1 is ``sin(x)``, 2 is
            ``sin(x) * exp(|x|)``, 3 is ``sin(x) * floor(|x|)`` and 4 is
            ``sin(x * floor(|x|))``.
        n_samples: Number of training points.
        noise: Standard deviation of the noise added to the targets.
        family: 'NN' or 'SVM'; any other value shows no fitting widget.

    Raises:
        ValueError: If ``function`` is not one of 1, 2, 3, 4.
    """
    targets = {
        1: lambda x: np.sin(x),
        2: lambda x: np.sin(x) * np.exp(np.abs(x)),
        3: lambda x: np.sin(x) * np.floor(np.abs(x)),
        4: lambda x: np.sin(x * np.floor(np.abs(x))),
    }
    if function not in targets:
        raise ValueError("Unknown function id: {!r}".format(function))
    f = targets[function]

    xmin = -6
    xmax = +6
    xplot = np.arange(xmin, xmax, 0.01)
    yplot = f(xplot)

    xtrain = xmin + (xmax - xmin) * np.random.rand(n_samples)
    ytrain = f(xtrain) + noise * np.random.randn(xtrain.size)

    # The data is passed as fixed (non-widget) arguments to both regressors.
    data = dict(xtrain=fixed(xtrain),
                ytrain=fixed(ytrain),
                xplot=fixed(xplot),
                yplot=fixed(yplot))

    if family == 'NN':
        interact(
            NNregressor,
            solver=['lbfgs', 'sgd', 'adam'],
            activation=['relu', 'identity', 'logistic'],
            hidden_layer_size=[(1,), (5, ), (50, ), (100, ), (1000, ),
                               (5, 5, ), (50, 50, ), (100, 100),
                               (50, 50, 50), (100, 100, 100)],
            reg=[0, 10**-3, 10**-2, 10**-1, 1],
            **data)

    elif family == 'SVM':
        interact(
            SVMregressor,
            kernel=['rbf', 'laplacian'],
            bw=ipywidgets.FloatSlider(value=-1,
                                      min=-3,
                                      max=3,
                                      step=0.1,
                                      readout_format='.1f',
                                      description='Bandwidth 10^:',
                                      style={'description_width': 'initial'},
                                      continuous_update=False),
            # No zero here: SVMregressor divides by ``reg``.
            reg=[10**-3, 10**-2, 10**-1, 1],
            **data)

# Interactive front-end for uat_demo: each list gives the choices offered for
# the argument of the same name.
interact(uat_demo, 
 n_samples=[100, 200, 500, 1000, 10000],
 noise=[0, 0.01, 0.05, 0.1, 0.5,],
 # Ids 1-4 select the target function defined inside uat_demo.
 function=ipywidgets.ToggleButtons(value=1, 
 options=[1, 2, 3, 4], 
 description='Function:',
 style={'description_width': 'initial'}),
 family=['NN', 'SVM']
 );